In [1]:
import scanpy as sc, anndata as ad, numpy as np, pandas as pd
from scipy import sparse
from anndata import AnnData
import warnings
import socket
from matplotlib import pylab
import sys
import yaml
import numbers
import os
import plotly
import matplotlib.pyplot as plt
import scvelo as scv
from tqdm.notebook import tqdm
import itertools
import random
import seaborn as sns

import rpy2
import rpy2.robjects as ro

warnings.filterwarnings('ignore')
In [2]:
%load_ext rpy2.ipython
In [3]:
%matplotlib inline
In [4]:
sc.settings.verbosity = 3         # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=50, facecolor='white')
pylab.rcParams['figure.figsize'] = (10, 10)
scanpy==1.8.1 anndata==0.7.6 umap==0.4.6 numpy==1.20.2 scipy==1.6.3 pandas==1.2.4 scikit-learn==0.24.2 statsmodels==0.13.1 python-igraph==0.9.8 louvain==0.7.1 pynndescent==0.5.5
In [5]:
plotly.offline.init_notebook_mode()
In [6]:
%matplotlib inline
In [7]:
sc.settings.verbosity = 3         # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=50, facecolor='white')
pylab.rcParams['figure.figsize'] = (9, 9)
scanpy==1.8.1 anndata==0.7.6 umap==0.4.6 numpy==1.20.2 scipy==1.6.3 pandas==1.2.4 scikit-learn==0.24.2 statsmodels==0.13.1 python-igraph==0.9.8 louvain==0.7.1 pynndescent==0.5.5

Configure paths¶

In [8]:
hostRoot = "-".join(socket.gethostname().split('-')[0:2])

with open(os.path.expanduser('~')+"/paths_config.yaml", 'r') as f:
    paths = yaml.load(f, Loader=yaml.FullLoader)

#indir=paths["paths"]["indir"][hostRoot]
outdir="./outdir/"
FinaLeaf="/Progenitors"
#projectBaseDir=paths["paths"]["projectBaseDir"][hostRoot]

Cells selection and pBulking¶

In [9]:
adata = sc.read_h5ad(outdir+FinaLeaf+"/4B_Progenitors_DA.h5ad")
adataraw = sc.read_h5ad(outdir+"/1_polaroid_mint.h5ad")[adata.obs_names]
adataraw.obs = adata.obs
adata = adataraw
In [10]:
sc.pp.normalize_total(adata)
normalizing counts per cell
    finished (0:00:00)
In [11]:
adata.obs
Out[11]:
dataset organoid region type type_region regionContrast n_genes_by_counts log1p_n_genes_by_counts total_counts log1p_total_counts ... S_score G2M_score phase Progenitor subLeiden umap_density_type umap_density_region umap_density_organoid DiffAbundant leiden_partition_final
AAACGAATCCCTTGTG-1_control3_piece2 control3_piece2 control3 piece2 control control_piece2 control 5025 8.522380 20194.0 9.913190 ... -0.028440 -0.286710 G1 NonCyclingProgenitor 8 0.412203 0.284801 0.196506 0 not_enriched
AAGTACCCAAAGCTCT-1_control3_piece2 control3_piece2 control3 piece2 control control_piece2 control 1987 7.594884 3498.0 8.160233 ... -0.220022 -0.168710 G1 NonCyclingProgenitor 3 0.342266 0.389470 0.119090 0 not_enriched
AAGTACCCACTGGACC-1_control3_piece2 control3_piece2 control3 piece2 control control_piece2 control 2253 7.720462 3527.0 8.168487 ... -0.125708 -0.218010 G1 NonCyclingProgenitor 3 0.689598 0.848694 0.214638 0 not_enriched
AAGTACCTCCAATCCC-1_control3_piece2 control3_piece2 control3 piece2 control control_piece2 control 3634 8.198364 13578.0 9.516280 ... -0.103682 -0.110184 G1 NonCyclingProgenitor 1 0.731295 0.616268 0.568335 0 not_enriched
AATAGAGAGATTAGTG-1_control3_piece2 control3_piece2 control3 piece2 control control_piece2 control 5001 8.517593 15610.0 9.655731 ... -0.149420 -0.256755 G1 NonCyclingProgenitor 3 0.764932 0.987658 0.220167 0 not_enriched
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
TTGTTGTAGGTCTGGA-1_polaroid3_proximal polaroid3_proximal polaroid3 proximal polaroid polaroid_proximal proximal 4526 8.417815 13515.0 9.511629 ... -0.051561 -0.075598 G1 NonCyclingProgenitor 4 0.938423 0.915668 0.550702 1 enriched
TTTCACAGTCTGTGTA-1_polaroid3_proximal polaroid3_proximal polaroid3 proximal polaroid polaroid_proximal proximal 3570 8.180601 9301.0 9.137984 ... -0.189825 -0.170289 G1 NonCyclingProgenitor 2 0.328438 0.249450 0.296778 1 enriched
TTTCAGTCAATTCACG-1_polaroid3_proximal polaroid3_proximal polaroid3 proximal polaroid polaroid_proximal proximal 1766 7.477038 2722.0 7.909490 ... -0.181366 -0.118886 G1 NonCyclingProgenitor 0 0.642922 0.689690 0.197166 1 enriched
TTTCCTCAGTCCTACA-1_polaroid3_proximal polaroid3_proximal polaroid3 proximal polaroid polaroid_proximal proximal 2687 7.896553 7148.0 8.874728 ... -0.091811 -0.112348 G1 NonCyclingProgenitor 6 0.676237 0.641296 0.702468 1 enriched
TTTGATCTCCGTTTCG-1_polaroid3_proximal polaroid3_proximal polaroid3 proximal polaroid polaroid_proximal proximal 3964 8.285261 13345.0 9.498972 ... -0.122932 -0.105680 G1 NonCyclingProgenitor 1 0.387854 0.212226 0.231424 0 not_enriched

2073 rows × 30 columns

In [12]:
adata.write_h5ad(outdir+FinaLeaf+"/5B_Progenitors_preBulk.h5ad")

by region¶

In [13]:
groupingCovariate = "region"
PseudooReplicates_per_group = 10
print("Pbulking with target of "+str(PseudooReplicates_per_group)+" PRs will result in following cells per PR")
adata.obs.groupby(groupingCovariate).size() / PseudooReplicates_per_group
Pbulking with target of 10 PRs will result in following cells per PR
Out[13]:
region
distal       15.7
medial       10.3
piece1       21.9
piece2       24.4
piece3       23.5
proximal    111.5
dtype: float64
In [14]:
total = pd.DataFrame(index = adata.var_names)
total_metadata = pd.DataFrame(index= ["_".join(Rep) for Rep in  list(itertools.product(adata.obs[groupingCovariate].cat.categories.tolist(), [str(r)  for r in list(range(PseudooReplicates_per_group))]))])
for group in adata.obs[groupingCovariate].cat.categories:
    groupAdata = adata[adata.obs[groupingCovariate] == group]
    
    group_cells = groupAdata.obs_names.tolist()
    random.Random(4).shuffle(group_cells)
    
    metaCellslist=[group_cells[i::PseudooReplicates_per_group] for i in range(PseudooReplicates_per_group)]
    
    for partition in enumerate(metaCellslist):
        
        total['{}_{}'.format(group, partition[0])] = adata[partition[1]].X.sum(axis = 0).A1
    
        total_metadata.loc['{}_{}'.format(group, partition[0]),"group"] = group
        total_metadata.loc['{}_{}'.format(group, partition[0]),"pseudoreplicate"] = partition[0]
        total_metadata.loc['{}_{}'.format(group, partition[0]),"number_of_cell"] = int(len(partition[1]))
    
    #With this line we propagate - whenever possible -  obs to aggregated pReplicate
    for obsMD in [obsMD for obsMD in groupAdata.obs.columns.tolist() if len(groupAdata.obs[obsMD].unique()) == 1 and obsMD != groupingCovariate]:
        total_metadata.loc[["_".join(l) for l in list(itertools.product([group],[str(r)  for r in list(range(PseudooReplicates_per_group))]))], obsMD ] = adata.obs.loc[adata.obs[groupingCovariate] == group,obsMD][0]
        
        
total_metadata = total_metadata.dropna(axis = 1)



adata_bulk = sc.AnnData(total.transpose()).copy()
adata_bulk.var = adata.var.copy()
adata_bulk.obs = pd.concat([adata_bulk.obs, total_metadata], axis = 1)


adata_bulk.obs["group"] =adata_bulk.obs["group"].astype("category")
with open("./colorMap.yaml", 'r') as f:
    colorMap = yaml.load(f, Loader=yaml.FullLoader)["uns_colors"]
colorMap
adata_bulk.uns["group_colors"] = [colorMap[i]["color"] for i in adata_bulk.obs["group"].cat.categories]

adata_bulk.obs["regionContrast"] = adata_bulk.obs["regionContrast"].astype("category")
adata_bulk.uns["regionContrast_colors"] = [colorMap[i]["color"] for i in adata_bulk.obs["regionContrast"].cat.categories]



adata_bulk.write_h5ad(outdir+FinaLeaf+"/5B_Progenitors_pBulk.bySegment."+str(PseudooReplicates_per_group)+"PRs.h5ad")
total.to_csv(outdir+FinaLeaf+"/5B_Progenitors_pBulk.bySegment."+str(PseudooReplicates_per_group)+"PRs.tsv", sep="\t")
... storing 'type' as categorical
... storing 'type_region' as categorical
... storing 'is.Stressed' as categorical
... storing 'phase' as categorical
... storing 'Progenitor' as categorical
In [ ]: